Chapter 5 Community composition

rm(list=ls()) #clear environment
load("data/squirrels_data.Rdata")

5.1 Count data preparation

genome_counts_log <- genome_counts %>% 
    column_to_rownames(var="genome") %>%
    mutate_all(~log10(.+1)) #fixed: mutate_at(vars(), ~log10(.+1))) was not working

genome_counts_pivot <- genome_counts %>%
  mutate_at(vars(-genome),~./sum(.)) %>% #apply TSS normalisation
  pivot_longer(-genome, names_to = "sample", values_to = "count") %>% #reduce to minimum number of columns
  left_join(., genome_metadata, by = join_by(genome == genome)) 
# %>% #append taxonomy
  # mutate(phylum = fct_relevel(phylum, rev(ehi_phylum_colors$phylum))) #sort phyla by taxonomy

genome_counts_by_host <- sample_metadata %>%
  select("sample","species","area_type", "development") %>%
  rename(host_sp=species) %>%
  left_join(genome_counts_pivot,., by=join_by("sample" == "sample")) #%>%

  
# Which host species each genome can be found in
genomes_by_species <- genome_counts_by_host %>%
  filter(count>0) %>%
  group_by(genome) %>%
  mutate(host = if_else(all(host_sp == "Sciurus vulgaris"), "only red",
                        if_else(all(host_sp == "Sciurus carolinensis"), "only grey", "both"))) %>%
  select(genome, host) %>%
  distinct(genome, .keep_all = TRUE) %>%
  left_join(.,genome_metadata, by='genome')

genomes_by_species$host <-factor(genomes_by_species$host, levels = c("both", "only red", "only grey"))

5.2 Genomes by host species

# Generate the phylum color heatmap
phylum_heatmap <- genome_metadata %>%
  arrange(match(genome, genome_tree$tip.label)) %>%
  select(genome,phylum) %>%
  mutate(phylum = factor(phylum, levels = unique(phylum))) %>%
  column_to_rownames(var = "genome")

# Create baseline circular genome tree
circular_tree <- force.ultrametric(genome_tree,method="extend") %>%
  ggtree(., layout = 'circular', size = 0.1, angle=45) +
  xlim(-1, NA)
***************************************************************
*                          Note:                              *
*    force.ultrametric does not include a formal method to    *
*    ultrametricize a tree & should only be used to coerce    *
*   a phylogeny that fails is.ultrametric due to rounding --  *
*    not as a substitute for formal rate-smoothing methods.   *
***************************************************************
# Add phylum colors ring
circular_tree <- gheatmap(circular_tree, phylum_heatmap, offset=0.0, width=0.3, colnames=FALSE) +
  scale_fill_manual(values=custom_colors, name="Phylum") +
  #geom_tiplab2(size=1, hjust=-0.1) +
  theme(legend.position = "right", plot.margin = margin(0, 0, 0, 0), panel.margin = margin(0, 0, 0, 0))


#Flush color scale to enable a new color scheme in the next ring
circular_tree <- circular_tree + new_scale_fill()

# Add host ring
circular_tree <-  circular_tree +
  new_scale_fill() +
  scale_fill_manual(values = c("black", "#ed2939", "#92a0ad"), name="Host\nspecies") + #"#cc3333", "#999999"
  geom_fruit(
    data=genomes_by_species,
    geom=geom_tile,
    mapping = aes(y=genome, fill=host),
    offset = 0.55,
    width=0.3)
  

#Plot circular tree
circular_tree

#number of MAGs by host species
genomes_by_species %>%
  dplyr::group_by(host) %>%
  summarise(n=length(host),
            percentage=(length(host)/1687)*100) %>%
  tt()
tinytable_4lb44wdbw55rvl4b9ysg
host n percentage
both 505 29.93480
only red 482 28.57143
only grey 700 41.49378
#most abundant phyla by host species
genome_counts_by_host %>%
  filter(count>0) %>%
  group_by(host_sp,phylum) %>%
  summarise(rel_abundance=sum(count)) %>%
  top_n(3, rel_abundance) %>%
  arrange(by_group=host_sp, desc(rel_abundance)) %>%
  tt()
tinytable_ubkqu672m9vg20dk9fyh
host_sp phylum rel_abundance
Sciurus carolinensis p__Bacillota_A 55.472233
Sciurus carolinensis p__Bacteroidota 16.149398
Sciurus carolinensis p__Bacillota 2.498674
Sciurus vulgaris p__Bacillota_A 48.125675
Sciurus vulgaris p__Bacteroidota 32.438965
Sciurus vulgaris p__Actinomycetota 16.517267
#most abundant class by host species
genome_counts_by_host %>%
  filter(count>0) %>%
  group_by(host_sp,class) %>%
  summarise(total_count=sum(count)) %>%
  top_n(5, total_count) %>%
  arrange(by_group=host_sp, desc(total_count)) %>%
  tt()
tinytable_3bmsqjasboh2mz48vi29
host_sp class total_count
Sciurus carolinensis c__Clostridia 55.472233
Sciurus carolinensis c__Bacteroidia 16.149398
Sciurus carolinensis c__Bacilli 2.498674
Sciurus carolinensis c__Verrucomicrobiae 2.094198
Sciurus carolinensis c__Vampirovibrionia 1.253040
Sciurus vulgaris c__Clostridia 48.125675
Sciurus vulgaris c__Bacteroidia 32.438965
Sciurus vulgaris c__Actinomycetia 16.040176
Sciurus vulgaris c__Bacilli 8.233587
Sciurus vulgaris c__Gammaproteobacteria 2.254878
#most common class by host species
genome_counts_by_host %>%
  #filter(count>0) %>%
  group_by(host_sp,class) %>%
  summarise(freq=sum(count>0)/n()) %>%
  top_n(3, freq) %>%
  arrange(by_group=host_sp, desc(freq)) %>%
  tt()
tinytable_c5uj7al4gs192x4731gz
host_sp class freq
Sciurus carolinensis c__Campylobacteria 0.4625000
Sciurus carolinensis c__Verrucomicrobiae 0.3875000
Sciurus carolinensis c__Peptococcia 0.3333333
Sciurus vulgaris c__Peptococcia 0.1666667
Sciurus vulgaris c__Campylobacteria 0.1454545
Sciurus vulgaris c__Negativicutes 0.1352273
#most abundant family by host species
genome_counts_by_host %>%
  filter(count>0) %>%
  group_by(host_sp,family) %>%
  summarise(abundance=sum(count)) %>%
  top_n(5, abundance) %>%
  arrange(by_group=host_sp, desc(abundance)) %>%
  tt()
tinytable_2icnqtbm9twdnalt743p
host_sp family abundance
Sciurus carolinensis f__Lachnospiraceae 29.253695
Sciurus carolinensis f__Oscillospiraceae 16.103469
Sciurus carolinensis f__Bacteroidaceae 9.370700
Sciurus carolinensis f__Muribaculaceae 4.737998
Sciurus carolinensis f__Borkfalkiaceae 2.527615
Sciurus vulgaris f__Lachnospiraceae 35.793100
Sciurus vulgaris f__Bacteroidaceae 21.149209
Sciurus vulgaris f__Bifidobacteriaceae 15.986288
Sciurus vulgaris f__Muribaculaceae 9.598616
Sciurus vulgaris f__Oscillospiraceae 4.267142
##FIX!!!
#most common family by host species
genome_counts_by_host %>%
  group_by(host_sp) %>%
  mutate(n = length(sample)) %>%
  ungroup() %>%
  # Group by host species and family to count the number of counts > 0
  group_by(host_sp, sample, family) %>%
  summarise(
    pos = sum(count > 0),
    n = first(n),
    freq = sum(count > 0) / first(n)
  ) %>%
  # Select the top 3 families by frequency for each host species
  group_by(host_sp) %>%
  top_n(3, freq) %>%
  # Sort the results
  arrange(host_sp, desc(freq)) %>%
  tt()
tinytable_kxfd06frdbed59vbx0yx
host_sp sample family pos n freq
Sciurus carolinensis EHI01467 f__Lachnospiraceae 161 134960 0.0011929461
Sciurus carolinensis EHI00382 f__Oscillospiraceae 142 134960 0.0010521636
Sciurus carolinensis EHI01480 f__Lachnospiraceae 138 134960 0.0010225252
Sciurus vulgaris EHI01458 f__Lachnospiraceae 91 185570 0.0004903810
Sciurus vulgaris EHI01512 f__Lachnospiraceae 91 185570 0.0004903810
Sciurus vulgaris EHI02304 f__Lachnospiraceae 71 185570 0.0003826049
vertical_tree <- force.ultrametric(genome_tree,method="extend") %>%
  ggtree(., size = 0.3)
***************************************************************
*                          Note:                              *
*    force.ultrametric does not include a formal method to    *
*    ultrametricize a tree & should only be used to coerce    *
*   a phylogeny that fails is.ultrametric due to rounding --  *
*    not as a substitute for formal rate-smoothing methods.   *
***************************************************************
#Add phylum colors
vertical_tree <- gheatmap(vertical_tree, phylum_heatmap, offset=0, width=0.1, colnames=FALSE, color=NA) +
  scale_fill_manual(values=custom_colors)

#Reset fill scale
vertical_tree <- vertical_tree + new_scale_fill()

#Add counts
vertical_tree <- gheatmap(vertical_tree, genome_counts_log, offset=0.5, width=3.5, color=NA, colnames=FALSE) + #, colnames_angle=90, font.size=2, colnames_position="top", colnames_offset_y = 9
  vexpand(.08) +
  coord_cartesian(clip = "off") +
  scale_fill_gradient(low = "#f4f4f4", high = "steelblue", na.value="white")

#Plot tree
vertical_tree +
  theme(legend.position='right')

5.3 Taxonomic composition of samples

# sample_sort <- sample_metadata %>%
#   arrange(Area_type) %>%
#   select(sample) %>%
#   pull()


# Plot stacked barplot
ggplot(genome_counts_by_host, aes(x=sample,y=count,fill=phylum, group=phylum))+ #grouping enables keeping the same sorting of taxonomic units
  geom_bar(stat="identity", colour="white", linewidth=0.02)+ #plot stacked bars with white borders
  scale_fill_manual(values=custom_colors, name="Phylum") +
  labs(y = "Relative abundance") +
  guides(fill = guide_legend(ncol = 1)) +
  facet_nested(~host_sp, scales="free", space="free") +
  theme(axis.text.x = element_blank(),
        axis.title.x = element_blank(),
        panel.background = element_blank(),
        panel.border = element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        axis.line = element_line(linewidth = 0.5, linetype = "solid", colour = "black"),
        legend.position="right",
        )

phylum_summary <- genome_counts_by_host %>%
  group_by(sample,host_sp,phylum) %>%
  summarise(relabun=sum(count))

phylum_summary %>%
    group_by(phylum) %>%
    summarise(mean=mean(relabun, na.rm=TRUE),sd=sd(relabun, na.rm=TRUE)) 
# A tibble: 13 × 3
   phylum                    mean       sd
   <chr>                    <dbl>    <dbl>
 1 p__Actinomycetota    0.0898    0.234   
 2 p__Bacillota         0.0565    0.110   
 3 p__Bacillota_A       0.545     0.252   
 4 p__Bacillota_B       0.00255   0.00418 
 5 p__Bacillota_C       0.00429   0.00664 
 6 p__Bacteroidota      0.256     0.178   
 7 p__Campylobacterota  0.00521   0.0250  
 8 p__Cyanobacteriota   0.00895   0.0118  
 9 p__Elusimicrobiota   0.00170   0.00694 
10 p__Patescibacteria   0.0000251 0.000346
11 p__Pseudomonadota    0.0180    0.0738  
12 p__Thermoplasmatota  0.000351  0.00366 
13 p__Verrucomicrobiota 0.0116    0.0229  
phylum_arrange <- phylum_summary %>%
    group_by(phylum) %>%
    summarise(mean=mean(relabun)) %>%
    arrange(-mean) %>%
    select(phylum) %>%
    pull()

phylum_summary %>%
    filter(phylum %in% phylum_arrange) %>%
    mutate(phylum=factor(phylum,levels=rev(phylum_arrange))) %>%
    ggplot(aes(x=relabun, y=phylum, group=phylum, color=phylum)) +
        scale_color_manual(values=custom_colors, name="Phylum") +
        geom_jitter(alpha=0.5) + 
        facet_nested(~host_sp, scales="free", space="free") +
        theme_minimal() + 
        theme(legend.position="none") +
        labs(y="Phylum",x="Relative abundance") 

family_summary <- genome_counts_by_host %>%
  group_by(sample,host_sp,family) %>%
  summarise(relabun=sum(count))

family_summary %>%
    group_by(family) %>%
    summarise(mean=mean(relabun, na.rm=TRUE),sd=sd(relabun, na.rm=TRUE)) %>%
    arrange(-mean)
# A tibble: 74 × 3
   family                  mean     sd
   <chr>                  <dbl>  <dbl>
 1 f__Lachnospiraceae    0.342  0.192 
 2 f__Bacteroidaceae     0.161  0.146 
 3 f__Oscillospiraceae   0.107  0.104 
 4 f__Bifidobacteriaceae 0.0841 0.233 
 5 f__Muribaculaceae     0.0755 0.0956
 6 f__Ruminococcaceae    0.0329 0.0377
 7 f__Borkfalkiaceae     0.0192 0.0270
 8 f__Lactobacillaceae   0.0163 0.0514
 9 f__Streptococcaceae   0.0161 0.0866
10 f__Acutalibacteraceae 0.0109 0.0135
# ℹ 64 more rows
family_arrange <- family_summary %>%
    group_by(family) %>%
    summarise(mean=sum(relabun)) %>%
    arrange(-mean) %>%
    select(family) %>%
    pull()

family_summary %>%
    left_join(genome_metadata %>% select(family,phylum) %>% unique(),by=join_by(family==family)) %>%
    left_join(sample_metadata,by=join_by(sample==sample)) %>%
    filter(family %in% family_arrange[1:20]) %>%
    mutate(family=factor(family,levels=rev(family_arrange[1:20]))) %>%
    filter(relabun > 0) %>%
    ggplot(aes(x=relabun, y=family, group=family, color=phylum)) +
        scale_color_manual(values=custom_colors) +
        geom_jitter(alpha=0.5) + 
        facet_grid(.~host_sp)+
        theme_minimal() + 
        labs(y="Family", x="Relative abundance", color="Phylum") + 
    guides(colour = guide_legend(override.aes = list(size=4, alpha=0.9)))

#NB: there are several unnamed genera from different families that get grouped together

genus_summary <- genome_counts_by_host %>%
  group_by(sample,host_sp,genus) %>%
  summarise(relabun=sum(count))

genus_summary %>%
    group_by(genus) %>%
    summarise(mean=mean(relabun, na.rm=TRUE),sd=sd(relabun, na.rm=TRUE)) %>%
    arrange(-mean)
# A tibble: 348 × 3
   genus                mean     sd
   <chr>               <dbl>  <dbl>
 1 g__Bifidobacterium 0.0841 0.233 
 2 g__Prevotella      0.0732 0.0968
 3 g__Acetatifactor   0.0429 0.0621
 4 g__                0.0383 0.0427
 5 g__Eisenbergiella  0.0322 0.0556
 6 g__Faecousia       0.0270 0.0366
 7 g__CAG-95          0.0219 0.0326
 8 g__Roseburia       0.0215 0.0526
 9 g__Phocaeicola     0.0194 0.0292
10 g__Dysosmobacter   0.0179 0.0175
# ℹ 338 more rows
genus_arrange <- genus_summary %>%
    group_by(genus) %>%
    summarise(mean=sum(relabun)) %>%
    arrange(-mean) %>%
    select(genus) %>%
    pull()

genus_summary %>%
    left_join(genome_metadata %>% select(genus,phylum) %>% unique(),by=join_by(genus==genus)) %>%
    left_join(sample_metadata,by=join_by(sample==sample)) %>%
    filter(genus %in% genus_arrange[1:20]) %>%
    mutate(genus=factor(genus,levels=rev(genus_arrange[1:20]))) %>%
    filter(relabun > 0) %>%
    ggplot(aes(x=relabun, y=genus, group=genus, color=phylum)) +
        scale_color_manual(values=custom_colors) +
        geom_jitter(alpha=0.5) + 
        facet_grid(.~host_sp)+
        theme_minimal() + 
        labs(y="Genus", x="Relative abundance", color="Phylum") + 
    guides(colour = guide_legend(override.aes = list(size=5, alpha=0.9)))
Warning in left_join(., genome_metadata %>% select(genus, phylum) %>% unique(), : Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 1 of `x` matches multiple rows in `y`.
ℹ Row 41 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship = "many-to-many"` to silence this warning.